/* This is to clean the test score dataset */

/* Notes: download the test score files, create another excel file with two sheets
one named "grade10" and one named "grade12" and put in local disk 
* make sure to delete all test score related files in dropbox */

clear 
import excel using $testscoredata, sheet("grade12") firstrow

rename *, lower

* passing grades 
g passing_eng = ca1>=8 & th1>=32 if sub1=="ENG"
g passing_dzo = ca2>=8 & th2>=32 if sub2=="DZO"
replace passing_eng = . if missing(ca1) | missing(th1)
replace passing_dzo = . if missing(ca2) | missing(th2)

lab var passing_eng "1 = Pass BHSEC English Exam"
lab var passing_dzo "1 = Pass BHSEC Dzongkhag Exam"

* numeric grade 
g score_eng = tot1 if sub1=="ENG"
g score_dzo = tot2 if sub2=="DZO"

g score_phy = tot3 if sub3=="CHE"
g score_che = tot3 if sub3=="PHY"
g score_bmt = tot3 if sub3=="BMT"
g score_acc = tot3 if sub3=="ACC"
g score_com = tot3 if sub3=="COM"
g score_mat = tot3 if sub3=="MAT"
g score_rige = tot3 if sub3=="RIGE"  //rigzhung
g score_bio = tot3 if sub3=="BIO"
g score_his = tot3 if sub3=="HIS"
g score_geo = tot3 if sub3=="GEO"
g score_evs = tot3 if sub3=="EVS"
g score_eco = tot3 if sub3=="ECO"
g score_med = tot3 if sub3=="MED"  //media studies
g score_agfs = tot3 if sub3=="AGFS"

replace score_phy = tot4 if sub4=="CHE"
replace score_che = tot4 if sub4=="PHY"
replace score_bmt = tot4 if sub4=="BMT"
replace score_acc = tot4 if sub4=="ACC"
replace score_com = tot4 if sub4=="COM"
replace score_mat = tot4 if sub4=="MAT"
replace score_rige = tot4 if sub4=="RIGE"  //rigzhung
replace score_bio = tot4 if sub4=="BIO"
replace score_his = tot4 if sub4=="HIS"
replace score_geo = tot4 if sub4=="GEO"
replace score_evs = tot4 if sub4=="EVS"
replace score_eco = tot4 if sub4=="ECO"
replace score_med = tot4 if sub4=="MED"  //media studies
replace score_agfs = tot4 if sub4=="AGFS"

replace score_phy = tot5 if sub5=="CHE"
replace score_che = tot5 if sub5=="PHY"
replace score_bmt = tot5 if sub5=="BMT"
replace score_acc = tot5 if sub5=="ACC"
replace score_com = tot5 if sub5=="COM"
replace score_mat = tot5 if sub5=="MAT"
replace score_rige = tot5 if sub5=="RIGE"  //rigzhung
replace score_bio = tot5 if sub5=="BIO"
replace score_his = tot5 if sub5=="HIS"
replace score_geo = tot5 if sub5=="GEO"
replace score_evs = tot5 if sub5=="EVS"
replace score_eco = tot5 if sub5=="ECO"
replace score_med = tot5 if sub5=="MED"  //media studies
replace score_agfs = tot5 if sub5=="AGFS"

replace score_phy = tot6 if sub6=="CHE"
replace score_che = tot6 if sub6=="PHY"
replace score_bmt = tot6 if sub6=="BMT"
replace score_acc = tot6 if sub6=="ACC"
replace score_com = tot6 if sub6=="COM"
replace score_mat = tot6 if sub6=="MAT"
replace score_rige = tot6 if sub6=="RIGE"  //rigzhung
replace score_bio = tot6 if sub6=="BIO"
replace score_his = tot6 if sub6=="HIS"
replace score_geo = tot6 if sub6=="GEO"
replace score_evs = tot6 if sub6=="EVS"
replace score_eco = tot6 if sub6=="ECO"
replace score_med = tot6 if sub6=="MED"  //media studies
replace score_agfs = tot6 if sub6=="AGFS"


lab var score_eng "total score BHSEC English Exam"
lab var score_dzo "total score BHSEC Dzongkhag Exam"
lab var score_phy "total score BHSEC Physics Exam"
lab var score_che "total score BHSEC Chemistry Exam"
lab var score_bmt "total score BHSEC Business Math Exam"
lab var score_acc "total score BHSEC Accountancy Exam"
lab var score_com "total score BHSEC Commerce Exam"
lab var score_mat "total score BHSEC Math Exam"
lab var score_rige "total score BHSEC Rigzhung Elective Exam"
lab var score_bio "total score BHSEC Biology Exam"
lab var score_his "total score BHSEC History Exam"
lab var score_geo "total score BHSEC Geography Exam"
lab var score_evs "total score BHSEC Environmental Science Exam"
lab var score_eco "total score BHSEC Economics Exam"
lab var score_med "total score BHSEC Media Studies Exam"
lab var score_agfs "total score BHSEC Agriculture for Food Security Exam"

* letter grade 
foreach sub in eng dzo {
g score_l`sub' = "A+" if score_`sub'>=91 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "A" if score_`sub'>=81 & score_`sub'<=90 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "B+" if score_`sub'>=71 & score_`sub'<=80 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "B" if score_`sub'>=61 & score_`sub'<=70 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "C+" if score_`sub'>=51 & score_`sub'<=60 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "C" if score_`sub'>=46 & score_`sub'<=50 & ~missing(score_`sub') & passing_`sub'
replace score_l`sub' = "D" if score_`sub'>=40 & score_`sub'<=45 & ~missing(score_`sub') & passing_`sub'
lab var score_l`sub' "letter grade of `sub' in BHSEC"
}

* other subjects are elective and have less observations 
g passing_bhsec = result == "PASS CERTIFICATE AWARDED"
lab var passing_bhsec "1 = BHSEC Pass Certificate Awarded"

rename cidnumber citizenid
replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"

rename contactnumber phone 

g sex = 1 if gender=="M"
replace sex = 0 if gender=="F"

g name = studentname
replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

replace schoolname = trim(itrim(lower(schoolname)))
replace dateofbirth = subinstr(dateofbirth,"-","",.)

g day = substr(dateofbirth, 7, 2)
g month = substr(dateofbirth, 5, 2)
g year = substr(dateofbirth, 1, 4)

g birthday = day + month + year

rename stream astream
g stream=1 if astream=="ARTS"
replace stream=2 if astream=="COMMERCE"
replace stream=3 if astream=="SCIENCE"

// some schools with variant names in the score database compared to ours
// make sure to correct, either way to match
replace schoolname = "bajo higher secondary school" if schoolname == "bajothang higher secondary school"
replace schoolname = "baylling central school" if schoolname == "bayling central school"
replace schoolname = "dechencholing higher secondary school" if schoolname == "dechhenchoeling higher secondary school"
replace schoolname = "desi higher secondary school" if schoolname == "desi high school"
replace schoolname = "pakshika central school" if schoolname == "pakshikha central school"
replace schoolname = "pelkhil higher secondary school" if schoolname == "pelkhil school"

keep schoolname name phone stream sex citizenid score_* passing_* indexnumber dateofbirth birthday
foreach i in schoolname name phone sex stream citizenid dateofbirth birthday {
    rename `i' `i'_s
}

save $temp/score12.dta, replace 

********************************************************************************
********************************************************************************
// Step 1: matched citizenid + fuzzy name 
********************************************************************************
********************************************************************************

gl criteria1 citizenid  

* open the score data 
use $temp/score12.dta, clear 
rename citizenid_s citizenid 

duplicates tag $criteria1, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/score12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_s
save $temp/score12_step1d, replace

* open the survey data 
use "$temp/TVET_analysis.dta", clear
drop sex 
rename sex_bl sex
drop name
gen name = subinstr(name_bl," ","",.) // remove space 

gen phone2 = phone_bl 
replace phone2 = phone if missing(phone_bl) | phone_bl == "NA" | phone_bl == "na"
drop phone phone_bl 
rename phone2 phone_bl

g day = substr(dateofbirth, 7, 2)
g month = substr(dateofbirth, 5, 2)
g year = substr(dateofbirth, 1, 4)

g birthday = day + month + year

generate str cid_string = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = cid_string
drop cid_string
describe citizenid

save $temp/data12, replace 

use $temp/data12, clear 

duplicates tag $criteria1, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step1d, replace

* merging two unique data together using fully matched CID and fuzzy name
use $temp/score12_step1u, clear
merge 1:1 $criteria1 using $temp/data12_step1u
matchit name_s name, g(namescore)

*br name* if namescore<0.7 & _merge==3
* correct if names are not matched, leave for the next round of matching
replace _merge = . if inlist(citizenid, "10807002633", "11309000812", "11402000288", "11801002366")

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/score12_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step1d
save $temp/score12_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step1d
save $temp/data12_step2b, replace 
restore 


********************************************************************************
********************************************************************************
// Step 2: matching schoolname + name + sex + fuzzy date of birth 
********************************************************************************
********************************************************************************

gl criteria2 schoolname_bl name sex 

* open the score dataset 
use $temp/score12_step2b, clear 
cap rename name_s name 
cap rename schoolname_s schoolname_bl 
cap rename sex_s sex 

duplicates tag $criteria2, g(temp)


* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/score12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename name name_s 
cap rename schoolname_bl schoolname_s 
cap rename sex sex_s 
save $temp/score12_step2d, replace

* open the survey data 
use $temp/data12_step2b.dta, clear
replace name = subinstr(name," ","",.) // remove space 

duplicates tag $criteria2, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step2d, replace

* merging two unique data together using fully matched CID and fuzzy name
clear
use $temp/score12_step2u
merge 1:1 $criteria2 using $temp/data12_step2u
matchit dateofbirth_s dateofbirth, g(birthscore)
matchit birthday_s birthday, g(birthdscore)

replace _merge = . if _merge==3 & ~(birthscore>0.70 | birthdscore>0.70) // allow 1-digit/unit mismatch in date of birth 

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/score12_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step2d
save $temp/score12_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step2d
save $temp/data12_step3b, replace 
restore 


********************************************************************************
********************************************************************************
// Step 3: schoolname + sex + birth date + fuzzy name 
********************************************************************************
********************************************************************************

gl criteria3 schoolname_bl sex dateofbirth 

use $temp/score12_step3b, clear
rename schoolname_s schoolname_bl 
rename sex_s sex 
rename dateofbirth_s dateofbirth 

duplicates tag $criteria3, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score12_step3u, replace 
restore 

keep if temp~=0 
drop temp
cap { 
rename schoolname_bl schoolname_s  
rename sex sex_s  
rename dateofbirth dateofbirth_s 
} 
save $temp/score12_step3d, replace 


use $temp/data12_step3b, clear
duplicates tag $criteria3, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/data12_step3u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data12_step3d, replace

use $temp/score12_step3u, clear 
merge 1:1 $criteria3 using $temp/data12_step3u
matchit name name_s, g(namescore)

* manual check: 
replace _merge=. if inlist(citizenid_s, "10903000461")

preserve 
keep if _merge==3 
keep uniqueid indexnumber 
save $temp/score12_step3m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step3d
save $temp/score12_step4b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step3d
save $temp/data12_step4b, replace 
restore 

********************************************************************************
********************************************************************************
// Step 4: sex + name + fuzzy phone (in case students transfered)
********************************************************************************
********************************************************************************

gl criteria4 sex name  

use $temp/score12_step4b, clear
rename sex_s sex 
rename name_s name 

duplicates tag $criteria4, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score12_step4u, replace 
restore 

keep if temp~=0 
drop temp
cap {  
rename sex sex_s  
rename name name_s
} 
save $temp/score12_step4d, replace 


use $temp/data12_step4b, clear
duplicates tag $criteria4, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/data12_step4u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data12_step4d, replace

use $temp/score12_step4u, clear 
merge 1:1 $criteria4 using $temp/data12_step4u
matchit phone_bl phone_s, g(phonescore)

replace _merge=. if phonescore<0.9 & _merge==3 

* keep 
preserve 
keep if _merge==3 
keep uniqueid indexnumber
save $temp/score12_step4m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step4d
save $temp/score12_step5b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step4d
save $temp/data12_step5b, replace 
restore 

********************************************************************************
********************************************************************************
// Step 5: sex + phone + fuzzy name 
********************************************************************************
********************************************************************************

gl criteria5 sex phone_bl 

use $temp/score12_step5b, clear
rename sex_s sex 
rename phone_s phone_bl 

duplicates tag $criteria5, g(temp)

preserve 
keep if temp==0
drop temp
save $temp/score12_step5u, replace 
restore 

keep if temp~=0 
drop temp
cap {  
rename sex sex_s  
rename phone_bl phone_s
} 
save $temp/score12_step5d, replace 


use $temp/data12_step5b, clear
duplicates tag $criteria5, g(temp)
tab temp 

preserve 
keep if temp==0
drop temp
save $temp/data12_step5u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data12_step5d, replace

use $temp/score12_step5u, clear 
merge 1:1 $criteria5 using $temp/data12_step5u
matchit name name_s, g(namescore)
matchit dateofbirth dateofbirth_s, g(birthscore)

*br name* namescore if _merge==3
* accept all

* keep 
preserve 
keep if _merge==3 
keep uniqueid indexnumber
save $temp/score12_step5m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step5d
save $temp/score12_step6b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step5d
save $temp/data12_step6b, replace 
restore 

// Step 6: sex + birthday + stream + fuzzy name? 
gl criteria6 sex dateofbirth stream 

use $temp/score12_step6b, clear
rename sex_s sex 
rename stream_s stream
rename dateofbirth_s dateofbirth

duplicates tag $criteria6, g(temp)
tab temp

preserve 
keep if temp==0
drop temp
save $temp/score12_step6u, replace 
restore 

keep if temp~=0 
drop temp
cap {  
rename sex sex_s  
rename stream stream_s 
rename dateofbirth dateofbirth_s 
} 
save $temp/score12_step6d, replace 


use $temp/data12_step6b, clear
duplicates tag $criteria6, g(temp)
tab temp 

preserve 
keep if temp==0
drop temp
save $temp/data12_step6u, replace 
restore 

keep if temp~=0 
drop temp 
save $temp/data12_step6d, replace

use $temp/score12_step6u, clear 
merge 1:1 $criteria6 using $temp/data12_step6u
matchit name name_s, g(namescore)

*br name* namescore if _merge==3
replace _merge = . if _merge==3 & namescore<0.9

* keep 
preserve 
keep if _merge==3 
keep uniqueid indexnumber
save $temp/score12_step6m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
append using $temp/score12_step6d
save $temp/score12_step7b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step6d
save $temp/data12_step7b, replace 
restore 

* create a new anonymous score dataset to match with analysis dataset 
use $temp/score12_step1m, clear
g note = "criteria: perfect cid + fuzzy name"
append using $temp/score12_step2m
replace note = "criteria: school + name + sex + fuzzy birthday" if missing(note)
append using $temp/score12_step3m
replace note = "criteria: school + sex + birthday + fuzzy name" if missing(note)
append using $temp/score12_step4m
replace note = "criteria: sex + name + fuzzy phone" if missing(note)
append using $temp/score12_step5m
replace note = "criteria: sex + phone + fuzzy name" if missing(note)
append using $temp/score12_step6m
replace note = "criteria: sex + birthday + stream + fuzzy name" if missing(note)
count // 5420

merge 1:1 indexnumber using $temp/score12.dta, keep(matched) nogen 
drop *_s  
tab note
save "$clean/exam_score12.dta", replace 

* clear all temporary datasets 
global tempfilelist: dir "$temp/" files "score12_*.dta"
	foreach tfile of global tempfilelist {
	erase "$temp/`tfile'"
}

global tempfilelist: dir "$temp/" files "data12_*.dta"
	foreach tfile of global tempfilelist {
	erase "$temp/`tfile'"
}